# Load the daily bike-sharing data and inspect its structure.
# (`<-` used for assignment per R convention.)
data <- read.csv("day.csv")
str(data)
## 'data.frame': 731 obs. of 16 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 0 1 2 3 4 5 6 0 1 ...
## $ workingday: int 0 0 1 1 1 1 1 0 0 1 ...
## $ weathersit: int 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : int 331 131 120 108 82 88 148 68 54 41 ...
## $ registered: int 654 670 1229 1454 1518 1518 1362 891 768 1280 ...
## $ cnt : int 985 801 1349 1562 1600 1606 1510 959 822 1321 ...
All of the features are numeric, we will change the categorical ones to factors and assign meaningful levels:
# Recode the integer-coded categorical columns as factors with meaningful
# labels. Using explicit `levels =` maps each label by code value rather
# than by position among the values that happen to occur, which is robust
# if a category is absent from the data (the original
# as.factor() + levels<- pattern silently shifts labels in that case).
recode_factors <- function(df) {
  df$season     <- factor(df$season,     levels = 1:4,  labels = c("spring", "summer", "fall", "winter"))
  df$yr         <- factor(df$yr,         levels = 0:1,  labels = c("2011", "2012"))
  df$mnth       <- factor(df$mnth,       levels = 1:12, labels = month.abb)  # "Jan".."Dec"
  df$holiday    <- factor(df$holiday,    levels = 0:1,  labels = c("no", "yes"))
  df$weekday    <- factor(df$weekday,    levels = 0:6,  labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
  df$workingday <- factor(df$workingday, levels = 0:1,  labels = c("no", "yes"))
  # Only weathersit codes 1-3 occur in this dataset (the documented 4th
  # level, heavy precipitation, is absent), matching the original 3-level factor.
  df$weathersit <- factor(df$weathersit, levels = 1:3,  labels = c("Clearish", "Misty", "LightPrecip"))
  df
}
data <- recode_factors(data)
str(data)
## 'data.frame': 731 obs. of 16 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ season : Factor w/ 4 levels "spring","summer",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : Factor w/ 2 levels "2011","2012": 1 1 1 1 1 1 1 1 1 1 ...
## $ mnth : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday : Factor w/ 7 levels "Sun","Mon","Tue",..: 7 1 2 3 4 5 6 7 1 2 ...
## $ workingday: Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 2 1 1 2 ...
## $ weathersit: Factor w/ 3 levels "Clearish","Misty",..: 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : int 331 131 120 108 82 88 148 68 54 41 ...
## $ registered: int 654 670 1229 1454 1518 1518 1362 891 768 1280 ...
## $ cnt : int 985 801 1349 1562 1600 1606 1510 959 822 1321 ...
We note that although there are four levels to the data specified in the dataset description, only three of those levels actually occur in the data.
# Count NA values across the entire data frame; 0 means no missing values anywhere.
sum(is.na(data))
## [1] 0
There does not appear to be any missing data.
# Five-number summary of the response variable (total daily ridership).
summary(data$cnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22 3152 4548 4504 5956 8714
# Visualize the distribution of the response.
hist(data$cnt, col = "lightslateblue", main = "Histogram of Total Ridership Count", xlab = "Total Ridership Count")
It seems sort of normally distributed, although since it is a count it should follow a Poisson distribution rather than normal.
pairs(data[,10:16])
It looks like there are relationships between temp and cnt, although not necessarily linear. The other relationships are not so clear.
# Pairwise correlations of the numerical features (named columns instead of
# positional indices 10:16).
cor(data[, c("temp", "atemp", "hum", "windspeed", "casual", "registered", "cnt")])
## temp atemp hum windspeed casual
## temp 1.0000000 0.9917016 0.12696294 -0.1579441 0.54328466
## atemp 0.9917016 1.0000000 0.13998806 -0.1836430 0.54386369
## hum 0.1269629 0.1399881 1.00000000 -0.2484891 -0.07700788
## windspeed -0.1579441 -0.1836430 -0.24848910 1.0000000 -0.16761335
## casual 0.5432847 0.5438637 -0.07700788 -0.1676133 1.00000000
## registered 0.5400120 0.5441918 -0.09108860 -0.2174490 0.39528245
## cnt 0.6274940 0.6310657 -0.10065856 -0.2345450 0.67280443
## registered cnt
## temp 0.5400120 0.6274940
## atemp 0.5441918 0.6310657
## hum -0.0910886 -0.1006586
## windspeed -0.2174490 -0.2345450
## casual 0.3952825 0.6728044
## registered 1.0000000 0.9455169
## cnt 0.9455169 1.0000000
# Summaries of the weather-related numerical features; note all are
# normalized (values in roughly [0, 1]).
summary(data[, c("temp", "atemp", "hum", "windspeed")])
## temp atemp hum windspeed
## Min. :0.05913 Min. :0.07907 Min. :0.0000 Min. :0.02239
## 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200 1st Qu.:0.13495
## Median :0.49833 Median :0.48673 Median :0.6267 Median :0.18097
## Mean :0.49538 Mean :0.47435 Mean :0.6279 Mean :0.19049
## 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302 3rd Qu.:0.23321
## Max. :0.86167 Max. :0.84090 Max. :0.9725 Max. :0.50746
par(mfrow=c(2,2))
# Histograms of the numerical weather features, referenced by name.
# temp/atemp are normalized to [0, 1] in this dataset (see the summary
# above), so the original "Temperature (Celsius)" label was misleading.
hist(data$temp, main="Temp", xlab = "Normalized Temperature", col = "tomato")
hist(data$atemp, main="ATemp", xlab = "Normalized Feeling Temperature", col = "tomato2")
hist(data$hum, main="Humidity", xlab = "Humidity", col = "turquoise1")
hist(data$windspeed, main="Windspeed", xlab = "Windspeed", col = "skyblue")
These do not appear to be normally distributed, although Windspeed could possibly be transformed toward normality, and Humidity is already fairly close to normal.
par(mfrow=c(1,3))
# Barplots of the categorical feature distributions. prop.table() yields
# proportions, so the y-axis is labeled "Proportion" (the original said
# "Count"); stray "?" removed from the weekday x-axis label.
barplot(prop.table(table(data$season)), col = 1:4, main = "Distribution of Season", xlab = "Season", ylab = "Proportion")
barplot(prop.table(table(data$mnth)), col = 1:12, main = "Distribution of Months", xlab = "Month", ylab = "Proportion")
barplot(prop.table(table(data$weekday)), col = 1:12, main = "Distribution of Weekdays", xlab = "Weekday", ylab = "Proportion")
barplot(prop.table(table(data$holiday)), col = 6:12, main = "Distribution of Holidays", xlab = "Holiday", ylab = "Proportion")
barplot(prop.table(table(data$workingday)), col = 8:12, main = "Distribution of Working Days", xlab = "Working Day", ylab = "Proportion")
barplot(prop.table(table(data$weathersit)), col = 10:12, main = "Distribution of Weather Types", xlab = "Weather Type", ylab = "Proportion")
It appears that Months, Seasons and Weekdays are distributed uniformly. However, there are very few holidays, more working days than non-working days and the weather seems to be mostly clear.
# create a color palette (brewer.pal is from RColorBrewer, loaded earlier)
palette(brewer.pal(n = 12, name = "Set3"))
par(mfrow=c(1,3))
# Total ridership summed per category. Bar names are taken directly from
# the factor levels so labels always match the aggregated groups, instead
# of repeating hard-coded vectors.
barplot(aggregate(x = data$cnt, by = list(data$mnth), FUN = sum)$x, names = levels(data$mnth), main = "Total Ridership By Month", col = 1:12)
palette(brewer.pal(n = 4, name = "Accent"))
barplot(aggregate(x = data$cnt, by = list(data$season), FUN = sum)$x, names = levels(data$season), main = "Total Ridership By Season", col = 1:4)
palette(brewer.pal(n = 4, name = "Set1"))
barplot(aggregate(x = data$cnt, by = list(data$weathersit), FUN = sum)$x, names = levels(data$weathersit), main = "Total Ridership By Weather Type", col = 1:4)
There appear to be relationships between Month, Season, Weather and Total Ridership. However, ridership by weather type closely resembles the overall distribution of weather types, so this relationship may not be so important.
par(mfrow=c(1,3))
palette(brewer.pal(n = 4, name = "Set3"))
barplot(aggregate(x = data$cnt, by = list(data$holiday), FUN = sum)$x, names = c("No", "Yes"), main = "Total Ridership By Holiday", col = 1:12)
palette(brewer.pal(n = 4, name = "Accent"))
barplot(aggregate(x = data$cnt, by = list(data$workingday), FUN = sum)$x, names = c("No", "Yes"), main = "Total Ridership By Workingday", col = 1:4)
palette(brewer.pal(n = 7, name = "Set2"))
# Weekday bar names taken from the factor levels so they always match the
# aggregated groups.
barplot(aggregate(x = data$cnt, by = list(data$weekday), FUN = sum)$x, names = levels(data$weekday), main = "Total Ridership By Weekday", col = 1:7)
While there is a higher ridership on non-holidays and workingdays, these very closely resemble the distributions of Holidays and Workingdays in the data, so these may not be very useful as predictors.
par(mfrow=c(1,3))
palette(brewer.pal(n = 6, name = "Dark2"))
# Ridership against each numerical weather feature, drawn via the formula
# interface (same points, labels and colours as the x/y-vector form).
plot(cnt ~ temp, data = data, main = "Temp vs Total Ridership", xlab = "Temp", ylab = "Total Ridership", col = 1)
plot(cnt ~ hum, data = data, main = "Humidity vs Total Ridership", xlab = "Humidity", ylab = "Total Ridership", col = 2)
plot(cnt ~ windspeed, data = data, main = "Windspeed vs Total Ridership", xlab = "Windspeed", ylab = "Total Ridership", col = 3)
It seems that there is a somewhat linear relationship between temperature and total ridership, although it may be more polynomial. It is not clear what the relationships between Humidity and Windspeed and Ridership are.
par(mfrow=c(1,3))
# Pairwise scatterplots among the weather features themselves, via the
# formula interface (response ~ predictor gives the same axes as plot(x, y)).
plot(hum ~ windspeed, data = data, main = "Windspeed vs Humidity", xlab = "Windspeed", ylab = "Humidity", col = 4)
plot(temp ~ windspeed, data = data, main = "Windspeed vs Temp", xlab = "Windspeed", ylab = "Temp", col = 5)
plot(temp ~ hum, data = data, main = "Humidity vs Temp", xlab = "Humidity", ylab = "Temp", col = 6)
It seems as if these numerical features may be related somehow, although they do not seem to be related linearly.
First we will look at boxplots of Total Ridership by the categorical features to see if anything stands out as worth exploring:
# Boxplots of total ridership by each categorical feature, to spot level
# effects worth modeling. Note: mfrow = c(1, 2) with six boxplots produces
# three successive figure pages of two plots each.
palette(brewer.pal(n = 12, name = "Set3"))
par(mfrow = c(1,2))
boxplot(cnt ~ weathersit, data = data, col = 1:4, main = "Count by Weather", ylab = "Total Ridership")
boxplot(cnt ~ season, data = data, col = 5:8, main = "Count by Season", ylab = "Total Ridership")
boxplot(cnt ~ mnth, data = data, col = 1:12, main = "Count by Month", ylab = "Total Ridership")
boxplot(cnt ~ weekday, data = data, col = 1:7, main = "Count by Weekday", ylab = "Total Ridership")
boxplot(cnt ~ workingday, data = data, col = 8:10, main = "Count by Working Day", ylab = "Total Ridership")
boxplot(cnt ~ holiday, data = data, col = 10:12, main = "Count by Holiday", ylab = "Total Ridership")
It appears that Weather, Season and Month may be worth further exploration, Weekday and Working Day don’t seem to be of much interest, and while the mean Ridership on Holidays is lower than on non-Holidays, it is not significantly lower.
We will now look at some interactions between the categorical features we identified above and some numerical features.
par(mfrow=c(1,2))
palette(brewer.pal(n = 4, name = "Set1"))
# Ridership vs temp/humidity, points coloured by season. Legend colours
# fixed to 1:4 to match the four labels (original used 1:5); the no-op
# lwd/lty arguments (lty = 0 draws no line) are dropped.
plot(data$temp, data$cnt, col = data$season, pch = 20, main = "Ridership vs Temp by Season", xlab = "Temperature", ylab = "Ridership")
legend("topleft", legend = c("Spring", "Summer", "Fall", "Winter"), col = 1:4, pch = 20)
plot(data$hum, data$cnt, col = data$season, pch = 20, main = "Ridership vs Humidity by Season", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = c("Spring", "Summer", "Fall", "Winter"), col = 1:4, pch = 20)
As we would expect the temperature to be correlated to the season the Ridership vs Temperature plot doesn’t show any interesting patterns. However, the Ridership vs Humidity plot has vertical clusters which could indicate interesting correlations.
par(mfrow=c(1,2))
palette(brewer.pal(n = 4, name = "Set2"))
# Ridership vs temp/humidity, points coloured by weather type. Legend
# colours fixed to 1:3 to match the three labels (original used 1:5).
plot(data$temp, data$cnt, col = data$weathersit, pch = 20, main = "Ridership vs Temp by Weather", xlab = "Temperature", ylab = "Ridership")
legend("topleft", legend = c("Clear", "Misty", "Light Precip"), col = 1:3, pch = 20)
plot(data$hum, data$cnt, col = data$weathersit, pch = 20, main = "Ridership vs Humidity by Weather", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = c("Clear", "Misty", "Light Precip"), col = 1:3, pch = 20)
Similarly, as we would expect Humidity to be correlated with Weather, the Ridership vs Humidity plot doesn't show any interesting patterns, while the Ridership vs Temperature plot indicates that there may be some interaction between temperature and weather worth exploring.
par(mfrow = c(1,2))
palette(brewer.pal(n = 12, name = "Set3"))
plot(data$temp, data$cnt, col = data$mnth, pch = 20, main = "Ridership vs Temp by Month", xlab = "Temp", ylab = "Ridership")
# Bug fix: the humidity plot's x-axis was mislabeled "Temp" in the original.
plot(data$hum, data$cnt, col = data$mnth, pch = 20, main = "Ridership vs Humidity by Month", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = levels(data$mnth), col = 1:12, pch = 20)
It is difficult to draw conclusions from the plots above; however, both interactions may merit further investigation.
Lets remove non useful features:
data=data[,c(-1,-2,-14,-15)] # remove instance, date, causal, registered variable since those are not needed for our model.
# Leave-one-out cross-validated RMSE for a fitted lm, using the closed-form
# shortcut: the LOOCV residual for observation i equals e_i / (1 - h_ii),
# where e_i is the ordinary residual and h_ii the leverage (hat value).
#
# model: an object fitted by lm().
# Returns: a single number, the LOOCV RMSE. Observations with leverage 1
#   produce non-finite terms and are excluded from the mean, as in the
#   original implementation.
calc_loocv_rmse <- function(model) {
  loo_resid_sq <- (resid(model) / (1 - hatvalues(model)))^2
  loo_resid_sq <- loo_resid_sq[is.finite(loo_resid_sq)]
  sqrt(mean(loo_resid_sq))
}
Lets separate numerical and categorical variable
numerical <- unlist(lapply(data, is.numeric)) # contains boolean value against each variable indicating whether that variable is a numeric or not
Lets first start by using only the numerical predictors to model the target variable
# Baseline model: numerical predictors only.
data_numerical <- data[, numerical] # keep the target plus all numerical columns
bike_mod_num <- lm(cnt ~ ., data = data_numerical) # model with all numerical variables
summary(bike_mod_num)[["coefficients"]] # get the coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3860.368 355.3890 10.8623754 1.426590e-25
## temp 2111.814 2282.1976 0.9253422 3.550954e-01
## atemp 5139.152 2576.9972 1.9942406 4.649933e-02
## hum -3149.110 383.9943 -8.2009286 1.081541e-15
## windspeed -4528.675 721.0854 -6.2803588 5.815379e-10
summary(bike_mod_num)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.460878
calc_loocv_rmse(bike_mod_num) # get the loocv rmse
## [1] 1456.515
par(mfrow = c(2, 2))
plot(bike_mod_num, col = 'dodgerblue') # standard lm diagnostic plots
Findings:
The above results show that temp is not a very significant predictor, as it has a high p-value. This can be attributed to the fact that temp and atemp are highly correlated, as we saw in the EDA; because of that, the effect of one is absorbed by the other.
The adjusted R-squared is quite low and does not indicate a good model.
The Fitted vs Residual plot shows a bit of non linear trend and the leverage plot also highlights some outliers.
We can expect these results to improve as we start using the categorical columns as dummy variables.
The cross validated rmse values are quite high, so we might want to improve upon that.
Lets now use both the categorical and numerical predictor and see if we get some lift in the models performance.
# Full model: all categorical and numerical predictors.
bike_mod_all <- lm(cnt ~ ., data = data) # model with all the variables
summary(bike_mod_all)[["coefficients"]] # get the coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1485.84392 239.74649 6.19756289 9.774832e-10
## seasonsummer 884.71081 179.49244 4.92895861 1.032402e-06
## seasonfall 832.70222 213.12943 3.90702600 1.024766e-04
## seasonwinter 1575.35064 181.00081 8.70355585 2.279871e-17
## yr2012 2019.73543 58.22037 34.69121792 2.294966e-154
## mnthFeb 131.02513 143.77618 0.91131318 3.624432e-01
## mnthMar 542.82773 165.43291 3.28125608 1.084575e-03
## mnthApr 451.16562 247.56686 1.82239911 6.881974e-02
## mnthMay 735.50548 267.62770 2.74824120 6.145473e-03
## mnthJun 515.40479 282.41068 1.82501876 6.842301e-02
## mnthJul 30.79664 313.82098 0.09813442 9.218536e-01
## mnthAug 444.94895 303.16514 1.46767848 1.426395e-01
## mnthSep 1004.17283 265.12279 3.78757646 1.651562e-04
## mnthOct 519.67428 241.55359 2.15138294 3.178651e-02
## mnthNov -116.69328 230.77620 -0.50565560 6.132572e-01
## mnthDec -89.59150 182.21449 -0.49168152 6.230982e-01
## holidayyes -589.69719 180.36226 -3.26951542 1.129889e-03
## weekdayMon 212.05386 109.49409 1.93666948 5.318673e-02
## weekdayTue 309.52763 107.13399 2.88916364 3.981733e-03
## weekdayWed 381.35742 107.47774 3.54824567 4.136266e-04
## weekdayThu 386.34297 107.52858 3.59293290 3.498183e-04
## weekdayFri 436.98377 107.43933 4.06726072 5.296026e-05
## weekdaySat 440.45884 106.56233 4.13334457 4.007227e-05
## weathersitMisty -462.53813 77.08692 -6.00021534 3.156422e-09
## weathersitLightPrecip -1965.08654 197.05173 -9.97244020 5.386219e-22
## temp 2855.01072 1398.15623 2.04198262 4.152649e-02
## atemp 1786.15738 1462.11957 1.22162196 2.222607e-01
## hum -1535.46844 292.44826 -5.25039352 2.013660e-07
## windspeed -2823.29668 414.55208 -6.81047529 2.091887e-11
summary(bike_mod_all)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8423198
calc_loocv_rmse(bike_mod_all) # get the loocv rmse
## [1] 794.7767
par(mfrow = c(2, 2))
plot(bike_mod_all, col = 'dodgerblue') # standard lm diagnostic plots
Findings:
Looks like this definitely helped in lifting the performance of the model as we now have a higher value of adjusted r-squared, but this comes at the cost at creating a complex model and the results show that not all variables are significant.
The residuals seem to have lost their normality.
The fitted vs residual plot shows a non linear trend and we also see presence of some extreme outlier.
The leverage plot also indicates presence of some outliers which we might have to check on as we go down the analysis.
The cross validated rmse has seen a huge drop from 1456 to 794 which is also a good indication
Based upon the results it would be good to test for the significance for few of the categorical variables which are listed below:
Month
Week Day
Working Day
# Drop-one models for the three categorical predictors under suspicion.
bike_mod_w_month <- lm(cnt ~ . - mnth, data = data) # model without month
bike_mod_w_weekday <- lm(cnt ~ . - weekday, data = data) # model without weekday
bike_mod_w_workingday <- lm(cnt ~ . - workingday, data = data) # model without workingday
# NOTE(review): models 1-3 are not nested in one another, so only their
# comparisons against the full model (model 4) are strictly valid F tests;
# interpret the intermediate rows with care.
anova(bike_mod_w_month, bike_mod_w_weekday, bike_mod_w_workingday, bike_mod_all) # ANOVA comparison
## Analysis of Variance Table
##
## Model 1: cnt ~ (season + yr + mnth + holiday + weekday + workingday +
## weathersit + temp + atemp + hum + windspeed) - mnth
## Model 2: cnt ~ (season + yr + mnth + holiday + weekday + workingday +
## weathersit + temp + atemp + hum + windspeed) - weekday
## Model 3: cnt ~ (season + yr + mnth + holiday + weekday + workingday +
## weathersit + temp + atemp + hum + windspeed) - workingday
## Model 4: cnt ~ season + yr + mnth + holiday + weekday + workingday + weathersit +
## temp + atemp + hum + windspeed
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 713 472452877
## 2 707 428442679 6 44010198 12.3957 2.683e-13 ***
## 3 702 415401688 5 13040991 4.4077 0.0005852 ***
## 4 702 415401688 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Findings:
The test shows that month and weekday are significant predictors, hence we can't rule them out. Even though the month variable is statistically significant, it may be that just a few of its levels are useful and the rest do not help. We will use model selection schemes later to find that out.
According to test results working day variable seems to be non significant and we can rule out this variable.
Lets fit the model again, without the working day variable.
# Refit without the working day variable, dropped by name rather than by
# position (-6), which is robust to column reordering.
data_2 <- data[, names(data) != "workingday"]
bike_mod_all_2 <- lm(cnt ~ ., data = data_2) # model with all remaining variables
summary(bike_mod_all_2)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8423198
calc_loocv_rmse(bike_mod_all_2) # get the loocv rmse
## [1] 794.7767
This looks like a good starting point. We can now begin identifying multicollinearity in the model; we will start by looking at VIF to understand whether multicollinearity is present.
# Variance inflation factors via faraway::vif; values well above ~5-10
# signal problematic collinearity.
library(faraway)
vif(bike_mod_all_2)
## seasonsummer seasonfall seasonwinter
## 7.496334 10.720030 7.455172
## yr2012 mnthFeb mnthMar
## 1.046828 1.835947 2.624298
## mnthApr mnthMay mnthJun
## 5.704404 6.868018 7.423137
## mnthJul mnthAug mnthSep
## 9.443506 8.813082 6.542133
## mnthOct mnthNov mnthDec
## 5.594951 4.956867 3.183722
## holidayyes weekdayMon weekdayTue
## 1.121296 1.821782 1.730242
## weekdayWed weekdayThu weekdayFri
## 1.741363 1.743011 1.740119
## weekdaySat weathersitMisty weathersitLightPrecip
## 1.725530 1.642310 1.338411
## temp atemp hum
## 80.806689 70.036722 2.140362
## windspeed
## 1.273296
As suspected before, temp and atemp have a high level of collinearity. Let's get the partial correlation coefficient for the temp variable and see whether it is useful for the model.
# Partial correlation of temp with cnt, controlling for the other
# predictors: correlate the residuals of cnt ~ (everything except temp)
# with the residuals of temp ~ (everything except cnt).
# NOTE(review): the original correlated resid(bike_mod_all_2) -- the FULL
# model, which already includes temp -- with resid(temp_model). Those
# residuals are orthogonal to temp's residual variation by construction,
# so the reported ~2e-16 is machine zero regardless of the data and says
# nothing about temp's usefulness; rerun to obtain the true value.
cnt_wo_temp_model <- lm(cnt ~ . - temp, data = data_2)
temp_model <- lm(temp ~ . - cnt, data = data_2)
cor(resid(cnt_wo_temp_model), resid(temp_model))
## [1] 2.286334e-16   (output of the original, pre-fix code)
Since the value is quite small, we can consider removing the temp variable from the model. Although multicollinearity might not have much impact on prediction, it does have an impact on the inference drawn from the model.
# Drop workingday and temp by name (original used positional -6, -8).
data_3 <- data[, !(names(data) %in% c("workingday", "temp"))]
bike_mod_all_3 <- lm(cnt ~ ., data = data_3)
summary(bike_mod_all_3)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8416089
calc_loocv_rmse(bike_mod_all_3) # get the loocv rmse
## [1] 791.1807
Next we also want to check for potential outliers; there are several ways of doing this.
We will be using cooks distance to identify any such outlier and see the effect of it on the model.
First let's calculate the number of observations flagged by Cook's distance:
# Count observations exceeding the common 4/n Cook's distance cutoff.
# Compute the distances once (the original called cooks.distance() twice
# in the same expression).
cd <- cooks.distance(bike_mod_all_3)
sum(cd > 4 / length(cd))
## [1] 42
Next we can consider fitting a model after removing these observation
# Refit after excluding the influential observations flagged above.
# (Variable name kept as `cokks_distance` -- a "cooks" typo -- because
# later chunks reference it by this name.)
cokks_distance <- cooks.distance(bike_mod_all_3)
bike_mod_all_4 <- lm(cnt ~ .,
  data = data_3,
  subset = cokks_distance <= 4 / length(cokks_distance))
summary(bike_mod_all_4)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9093198
calc_loocv_rmse(bike_mod_all_4) # get the loocv rmse
## [1] 584.7121
par(mfrow = c(2, 2))
plot(bike_mod_all_4, col = 'dodgerblue') # standard lm diagnostic plots
Findings:
Removing these outliers helped get a lift in the adjusted r-squared from 84% to 90% which is a good thing.
The residuals start looking more normal.
The residual vs fitted plot still show some non linear pattern which indicates to the fact to try out higher order terms.
The leverage plot looks much neater.
The removal of outlier helped in improving the cross validated rmse from 794 to 584.
Lets include the interactions and see if it helps improve upon the models performance.
# Add all two-way interactions (.^2) among the remaining predictors,
# still excluding the flagged influential observations.
bike_mod_all_5 <- lm(cnt ~ .^2,
  data = data_3,
  subset = cokks_distance <= 4 / length(cokks_distance))
summary(bike_mod_all_5)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.946074
calc_loocv_rmse(bike_mod_all_5) # get the loocv rmse
## [1] 610.4657
par(mfrow = c(2, 2))
plot(bike_mod_all_5, col = 'dodgerblue') # standard lm diagnostic plots
length(coef(bike_mod_all_5)) # number of parameters
## [1] 305
Findings:
We see a lift in the adjusted r squared from 90% to 94% which is a good thing
The residual vs fitted plot looks random with errors randomly distributed around 0 line, there are still some outliers which are getting highlighted on the plot.
The Q-Q plot looks more normal.
The leverage plot shows some indication of potential outliers.
The addition of interaction slightly increases the cross validated rmse value.
Next we will use a model selection criteria to rule out predictors that are not useful since after addition of interaction we have a lot of predictors.
# Backward AIC selection to prune the large interaction model.
bike_mod_all_6 <- step(bike_mod_all_5, trace = 0, direction = "backward")
summary(bike_mod_all_6)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9475997
calc_loocv_rmse(bike_mod_all_6) # get the loocv rmse
## [1] 482.5356
par(mfrow = c(2, 2))
plot(bike_mod_all_6, col = 'dodgerblue') # standard lm diagnostic plots
length(coef(bike_mod_all_6)) # number of parameters
## [1] 127
Findings:
We were able to reduce the number of predictors from 305 to 127.
The reduction in the number of predictors did not compromise the adjusted R-squared value.
The error looks normal distributed.
Model selection again helped by dropping the cross-validated RMSE, this time to 482.
Let's also check whether adding polynomial features turns out to be useful for the model.
# Try quadratic terms for the numerical predictors instead of interactions,
# then prune with backward AIC selection.
temp_mod <- lm(cnt ~ . + I(atemp^2) + I(hum^2) + I(windspeed^2),
  data = data_3,
  subset = cokks_distance <= 4 / length(cokks_distance))
bike_mod_all_7 <- step(temp_mod, trace = 0, direction = "backward")
summary(bike_mod_all_7)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9185862
calc_loocv_rmse(bike_mod_all_7) # get the loocv rmse
## [1] 554.8853
par(mfrow = c(2, 2))
plot(bike_mod_all_7, col = 'dodgerblue') # standard lm diagnostic plots
Findings:
As compared to the adjusted r-squared value of 94% that we got using interaction we get a lower value of 91% using polynomial features.
The residual vs fitted plot does not look random and shows some non linear pattern.
This has caused an increase in cross validated rmse .
We will also want to check if the transformation of some variables helps improve the model.
# Log-transform the response and refit the interaction model, then prune.
# NOTE(review): with a log response, the adjusted R^2 (and any RMSE of
# resid()) are on the log scale and not directly comparable to the
# untransformed models above.
temp_m <- lm(log(cnt) ~ .^2,
  data = data_3,
  subset = cokks_distance <= 4 / length(cokks_distance))
bike_mod_all_8 <- step(temp_m, trace = 0, direction = "backward")
summary(bike_mod_all_8)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9376945
par(mfrow = c(2, 2))
plot(bike_mod_all_8, col = 'dodgerblue') # standard lm diagnostic plots
length(coef(bike_mod_all_8)) # number of parameters
## [1] 114
Findings: